Library¶

In [15]:
import pandas as pd
import numpy as np
import gzip
import sys
sys.path.append('../../../Util')
import PreProcessingText as ppt
import BERTopicUtils as btu
import TextClustering as tc
from llama_cpp import Llama
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from bertopic.vectorizers import ClassTfidfTransformer
from transformers import pipeline
import torch

¶

Preprocess text to reduce time computing of SentenceTransformer¶

In [4]:
df = btu.load_data_filtered('../merged_data.zip', 'content')
270239
In [13]:
df['content'] = df['content'].apply(ppt.clean_sentences)
In [19]:
df.to_csv('content_clean_sentences.zip', index=False)
df_unique = pd.DataFrame(df['content'].unique())
df_unique.to_csv('PreProcessFiles/only_content_clean_sentences.csv', index=False, header=False)
In [20]:
df['content'] = df['content'].apply(ppt.preprocess_title)
df['content'] = df['content'].apply(ppt.remove_single_characters)
df.to_csv('PreProcessFiles/cleaned_data_name_content.csv', index=False)
df['content'].value_counts()
Out[20]:
content
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  1301
number                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             253
thanks                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             111
link                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                86
market                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              72
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  ... 
link bluefairydmt bluefairydmt link link hoffmansdream hoffmansdream link exactly right thanks guy making response easier case nt accept demand nt respond well threat let example vendor think otherwise vending platform privilege right want vendor treat customer respect fair play customer banned vendor outstanding order fret absolutely refund escrow order may affected actually get delivered vendor standard policy ensure customer receive highest quality service thank                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                1
link MushMerica MushMerica link link dcupdancer dcupdancer link link blackship blackship link link darknetanon darknetanon link link encrypt encrypt link others Let look fact shall PGP key blackship Grey match PGP key blackship empire NOT FE status empire link blackship blackship link trusted status apollon really make wonder apollon nt userphp uidblackship sale apollon month threatening leave grey go apollon umm ok bro good luck signed grey giving away free account started user bwluwug reason blackship banned clearly written profile accept vendor using wickr saw using wickr message sent customer nt yet purchased written profile funny thing reason even reviewed account messaged demanding FE saying trusted shit dumb as account violation rule threatening banned also checked see open dispute sale also checked support message many customer complaining wan na go apollon get sale cool man nt ban actually thought funny strange lot vendor saying lately want free account cause already paid apollon FE already Apollon look nt give fuck Apollon status clear using example taking thing seriously given fact everything dodgy af vending month empire demanding FE life ca nt understand apollon would give trusted status maybe mistake dunno nt really care care protecting customer vendor follow rule finally open order vendor customer dispute get coin returned otherwise coin still sent vendor hope cleared enjoy day                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              1
real life testimonial Grey Market vendor HoffmansDream want preface review saying total newbie recreational drug grew Bible Belt say drug era made adolescence college year adult life without ever trying drug except getting drugged ex girlfriend another story another day never opposed friend using recreational drug fact fraternity college used sort drug another personally nt think good idea addictive personality growing many friend told Pot legal fast forward many decade marijuana legal state chronic pain cause severe migraine tried everything nothing worked started experimenting bit MDMA several month ago wonder drug although hour wonderful totally escape pain get total relief feel good True ecstasy pun intended shame recommended every week luck would girl cut hair recommended also suffered migraine ailment described wonderful also mentioned amazing sex offered get believe tried everything described better almost like miracle drug could get pill exhausted supply suddenly moved sad day contact people drug would nt dare ask friend knew could get drug started researching extensively online could safely get MDMA read could also almost obsessive read safe drug unless tested started buying reagent kit even found could get hand buying MDMA ha bit comical think look back came across several great review Prism Reagents wow amazing company efficient place order within day reagent test first ordered Marquis Mandolin Ehrlich point nt even sure order besides MDMA time always quick ship ca nt recommend enough amazing company product NO excuse anyone drug without testing reagent kit shipping perfectly legal buy reagent kit worry ordered MDMA different people Grey Market Grey Market went week ago found different forum one Avengers Envoy Dread totally time could nt sign Dread amazing resource community came across post vendor named HoffmansDream also responded post made came across level headed ethical educational saw listing free sample pure MDMA Grey Market needed pay shipping thought give shot helpful educational information also started exchanging email gave much educational safety information MDMA drug well clear nt money thought least could post real life testimonial sample ordered Grey Market advertised mg pure MDMA pay shipping got quickly Stealth good came first class mail short day vendor nt want really communicate understand different encouraged ask question especially safety aspect drug fascinated approach tested sample got weighed actually got bit advertised sent mg tested Mandolin Marquis reagent passed flying color Mandolin went straight black Marquis went purple black photo took second Marquis Mandolin Marquis ca nt recommend enough vendor also good around guy sound like long time advocate drug safety testing promotes real sense community reminds friend knew told pot legal free put want body never million year thought dark net buying drug nt thankful vendor like many others provide service one guy buy drug help alleviate pain hour month nt buy anyone else would nt dream selling product buy use forever grateful marketplace like Grey Market Apollon wonderful forum like Dread Reddit others hope review help HoffmansDream nt personally know cheer       1
link hoffmansdream hoffmansdream link vendor since beginning highquality vendor also great community member helping grow promoting sharing positive vibe proud vendor like marketplace consider part Grey Market family vendor aspire like                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           1
Sorry happened believe multiple people thought gotten scammed long shipping time told mod                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            1
Name: count, Length: 261801, dtype: int64

Modelling¶

1° Baseline Summary: Specific Topics (Best Baseline)¶

Clustering Approach¶

  • Parameter Setting:
    • Embedding Model: all-MiniLM-L6-v2
    • Representation Model: keyBERTInspired, MaximalMarginalRelevance
    • Count Vectorizer
    • CtfIDF
    • UMAP: 20 neighbors, 8 components
    • HDBSCAN: 190 min cluster size
    • Zero-Shot Classification on cluster name
    • Outliers-reduction with Probabilites: 0.03 threshold

Clustering Results¶

  • Clusters Retrieved: 121 in which the most important ones concern:

    1. Drug sales (marijuana, cocaine, xanax, pills, meth, fentanyl,
    2. Bitcoin
    3. Scammers and seller reviews
    4. Marketplace advertising
    5. Purchase reviews
    6. Drug purchases
    7. Orders
    8. Closed sites (empire market, 
    9. Scams
    10. Sold passwords
    11. Hacker attacks
    12. Opsec questions
    13. Document and credit card forgery
    14. Chat links
  • Performance Metrics:

    • Silhouette Score: 0.60
    • Davies-Bouldin Score: 0.46
    • Coherence-Score: 0.69
    • Dos Score: 0.24
    • %Outliers: 0.35 (91k/260k)
In [7]:
df = btu.load_data_filtered('PreProcessFiles/cleaned_data_name_content.csv', 'content')
261799
In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
tc1 = tc.TextClustering(df, 'content')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
Using device: cpu
In [ ]:
'''
np.savez_compressed('PreProcessFiles/content_preprocessed_embeddings.npz', tc1.corpus_embeddings)
with gzip.open('PreProcessFiles/content_preprocessed_corpus.txt.gz', 'wt') as f:
    for word in tc1.corpus:
        f.write(word + '\n')
'''
In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
In [3]:
with np.load('PreProcessFiles/content_preprocessed_embeddings.npz') as data:
    embeddings = data['arr_0']

with gzip.open('PreProcessFiles/content_preprocessed_corpus.txt.gz', 'rt') as f:
    corpus = f.read().split('\n')

corpus.pop()
Out[3]:
''
In [5]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=20, n_components=8, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=190, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 3),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    calculate_probabilities=True,
    low_memory=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(corpus, embeddings)
2024-07-05 00:43:08,961 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-05 00:55:22,441 - BERTopic - Dimensionality - Completed ✓
2024-07-05 00:55:22,520 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-05 01:03:05,690 - BERTopic - Cluster - Completed ✓
2024-07-05 01:03:05,990 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-05 01:04:40,995 - BERTopic - Representation - Completed ✓
In [6]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(corpus, vectorizer_model=vectorizer_model)
In [7]:
topic_model.get_topic_info()
Out[7]:
Topic Count Name Representation Representative_Docs
0 -1 139971 -1_vendor_nt_order_link [vendor, nt, order, link, market, like, time, ... [year using dnm found vendor like heinekenexpr...
1 0 13610 0_cart_weed_strain_thc [cart, weed, strain, thc, price, product, bud,... [general review template general information d...
2 1 5164 1_deposit_address_ticket_btc [deposit, address, ticket, btc, wallet, deposi... [missing two deposit week ago big deposit erro...
3 2 5131 2_key_pgp_pgp key_account [key, pgp, pgp key, account, message, password... [ordered item dream attempt send address vendo...
4 3 3015 3_order_shipped_day_ordered [order, shipped, day, ordered, week, marked, p... [vendor mark shipped accept order nt necessari...
... ... ... ... ... ...
117 116 212 116_acetone_water_dissolve_dry [acetone, water, dissolve, dry, solvent, filte... [acetone wash basic bare minimum lazy simple w...
118 117 208 117_witchman_link_link witchman_witchman link [witchman, link, link witchman, witchman link,... [try witchman, link rtuna rtuna link link witc...
119 118 201 118_tochka_market_tochka market_tochka tochka [tochka, market, tochka market, tochka tochka,... [tochka really, tochka good, tochka]
120 119 194 119_read_post_reading_read post [read, post, reading, read post, thread, shit,... [totally read robin williams voice great guy s...
121 120 193 120_subdread_subdreads_create_make subdread [subdread, subdreads, create, make subdread, p... [found subdread, neither subdread, tried creat...

122 rows × 5 columns

In [8]:
umap_embeddings = topic_model.umap_model.fit_transform(embeddings)
In [9]:
sihouette_davies_score = btu.calculate_silhouette_davies(umap_embeddings, topics)
coherence_score = btu.evaluate_topic_coherence(topic_model.get_topics(), corpus)
dos_score = btu.calculate_dos(topic_model.get_topics())
Silhouette Score: 0.5987827181816101
Davies-Bouldin Score: 0.46664130293567774
Coherence Model: 0.6899798666209717
Distinta Overlap Score: 0.235062999593551
In [10]:
new_topics = topic_model.reduce_outliers(corpus, topics, strategy="probabilities", probabilities=probs, threshold=0.03)
topic_model.update_topics(corpus, topics=new_topics)
topic_model.get_topic_info()
2024-07-05 01:32:18,223 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[10]:
Topic Count Name Representation Representative_Docs
0 -1 91708 -1_vendor_order_nt_market [vendor, order, nt, market, link, get, like, t... [year using dnm found vendor like heinekenexpr...
1 0 14083 0_cart_weed_strain_thc [cart, weed, strain, thc, bud, price, product,... [general review template general information d...
2 1 5365 1_deposit_address_ticket_btc [deposit, address, ticket, btc, wallet, deposi... [missing two deposit week ago big deposit erro...
3 2 5696 2_key_pgp_account_pgp key [key, pgp, account, pgp key, password, message... [ordered item dream attempt send address vendo...
4 3 3368 3_order_shipped_ordered_day [order, shipped, ordered, day, week, marked, p... [vendor mark shipped accept order nt necessari...
... ... ... ... ... ...
117 116 1754 116_water_acetone_powder_dry [water, acetone, powder, dry, ml, solution, di... [acetone wash basic bare minimum lazy simple w...
118 117 218 117_witchman_link witchman_link_witchman link [witchman, link witchman, link, witchman link,... [try witchman, link rtuna rtuna link link witc...
119 118 210 118_tochka_market_tochka market_tochka tochka [tochka, market, tochka market, tochka tochka,... [tochka really, tochka good, tochka]
120 119 2589 119_post_know guy know_guy know guy_know guy [post, know guy know, guy know guy, know guy, ... [totally read robin williams voice great guy s...
121 120 509 120_subdread_sub_post_subdreads [subdread, sub, post, subdreads, create, dread... [found subdread, neither subdread, tried creat...

122 rows × 5 columns

In [11]:
btu.print_topics(topic_model, topics)
Topic 0:
[('cart', 0.006819976279333017), ('weed', 0.006434280511631705), ('strain', 0.006347524574772897), ('thc', 0.00565511355660195), ('bud', 0.005357358556622115), ('price', 0.005221536237020541), ('product', 0.005104207291030023), ('quality', 0.004796209796012732), ('shipping', 0.0045011583267441), ('high', 0.004174592956253663)]
Topic 1:
[('deposit', 0.032787925045137875), ('address', 0.015262197706025837), ('ticket', 0.014942954405397602), ('btc', 0.013042960101162), ('wallet', 0.011958076879534037), ('deposited', 0.011621786229663771), ('withdraw', 0.010362863029063115), ('withdrawal', 0.00933558444248008), ('support', 0.008322754789186544), ('empire', 0.00824793478077134)]
Topic 2:
[('key', 0.02641300395674321), ('pgp', 0.022053264456653992), ('account', 0.012948317965267183), ('pgp key', 0.012794084094563988), ('password', 0.010928399531846645), ('message', 0.010825631684437072), ('fa', 0.009133641553267683), ('encrypt', 0.00789176529994535), ('pin', 0.007279354300454491), ('decrypt', 0.006641582296102854)]
Topic 3:
[('order', 0.028750903061655165), ('shipped', 0.017771667301537914), ('ordered', 0.014115215087537993), ('day', 0.013792837195896362), ('week', 0.011825719789508337), ('marked', 0.009398320649312888), ('placed', 0.00936030713326541), ('marked shipped', 0.008314437271465129), ('monday', 0.007335242673898797), ('placed order', 0.0072314465750561085)]
Topic 4:
[('scam', 0.030207670216535733), ('scammer', 0.024388940928940277), ('scam scam', 0.015160181624773438), ('scam scam scam', 0.014703896687039586), ('scammed', 0.014384231449956375), ('scamming', 0.010244403162267237), ('vendor', 0.009295763881616164), ('nt', 0.006198805629845099), ('people', 0.006035581758898485), ('selective', 0.005245332330006293)]
Topic 5:
[('thanks', 0.032484060113625514), ('thank', 0.020008268226289372), ('lol', 0.01858082301640747), ('man', 0.01506508666913702), ('bro', 0.01354814680643686), ('good', 0.013468580049776998), ('nice', 0.010857722472134321), ('work', 0.01032501871008713), ('mate', 0.010119311243757782), ('fuck', 0.010089440008756048)]
Topic 6:
[('ship', 0.022912846582628665), ('country', 0.01909048352403851), ('eu', 0.01152845695773683), ('shipping', 0.011018795967083689), ('uk', 0.010823419658895283), ('usa', 0.01040669549033286), ('germany', 0.0092742735281788), ('international', 0.009207306185330014), ('nl', 0.009081923138059124), ('domestic', 0.00904504651788277)]
Topic 7:
[('coke', 0.03196509150656362), ('cocaine', 0.024105728506862243), ('quality', 0.006786427310805234), ('product', 0.006224607518305623), ('good', 0.006139079377307625), ('cut', 0.0059463692670827855), ('price', 0.005878671696483089), ('pure', 0.005608956759515375), ('best', 0.005571411460633695), ('link', 0.005265343185867766)]
Topic 8:
[('card', 0.04551641786998027), ('carding', 0.01958935933058771), ('cc', 0.012108596418107407), ('gift card', 0.008731997043516414), ('gift', 0.008668562268329372), ('buy', 0.006474923804008902), ('credit', 0.005923858263623121), ('cvv', 0.005712284196683973), ('dump', 0.005264099076346724), ('know', 0.005224528810605994)]
Topic 9:
[('pgp', 0.01574546581140972), ('begin pgp', 0.013133098872604033), ('begin', 0.012653097321072662), ('pgp signature', 0.01201406674160459), ('signature', 0.01155220208870939), ('end pgp', 0.008436432878705049), ('pgp signed', 0.007152002390777196), ('sha', 0.007101864353805898), ('begin pgp signature', 0.007094591077625983), ('hash sha', 0.0070938472613224545)]
Topic 10:
[('lsd', 0.044096236175307287), ('tab', 0.031201653095285848), ('ug', 0.016931316754188516), ('acid', 0.00873145849556676), ('blotter', 0.006635925340689199), ('sheet', 0.0062406804668846735), ('gg', 0.005570579662393711), ('vendor', 0.005391703453952524), ('lsd vendor', 0.005092439542171353), ('order', 0.004888302210347344)]
Topic 11:
[('vendor', 0.040480309109643724), ('good', 0.009500581513679966), ('anyone', 0.007718290266453895), ('know', 0.007581800407960307), ('legit', 0.007153009277232861), ('looking', 0.006083052727996863), ('name', 0.00606434994513405), ('good vendor', 0.005989118667240021), ('nt', 0.0054430073211006865), ('find', 0.005305649880815142)]
Topic 12:
[('dispute', 0.06090356547124397), ('refund', 0.019393067663640136), ('mod', 0.011988844923573458), ('order', 0.01088510270527193), ('moderator', 0.010282057687013528), ('vendor', 0.010214053370392403), ('day', 0.009693629309438961), ('open', 0.008613285488518007), ('refunded', 0.007447784792581303), ('resolved', 0.007391220612062599)]
Topic 13:
[('wsm', 0.059494126404102746), ('dream', 0.007982112763835541), ('market', 0.007789943938095672), ('exit', 0.007108157072916501), ('vendor', 0.007098566661040291), ('order', 0.007062707793584658), ('wsm exit', 0.005348500113356284), ('scam', 0.0051230267318945226), ('nt', 0.004657331003913841), ('exit scam', 0.00445661020792704)]
Topic 14:
[('drug', 0.006367626524558121), ('police', 0.00600917930410098), ('get', 0.005950063217179176), ('nt', 0.005855916526435605), ('house', 0.005342747778326546), ('cop', 0.005191953970332264), ('people', 0.00515934015776538), ('lawyer', 0.0046484092621795646), ('like', 0.004325926496637044), ('gun', 0.004297377651024408)]
Topic 15:
[('monero', 0.036491964872520856), ('xmr', 0.028270566843165295), ('wallet', 0.024641807217009672), ('btc', 0.021258198016140695), ('exchange', 0.013865828954017289), ('bitcoin', 0.012370315889623338), ('use', 0.010836184003036951), ('xmrto', 0.010794153696213912), ('coin', 0.010188403286916996), ('electrum', 0.007950751449379814)]
Topic 16:
[('ddos', 0.058028666646875525), ('attack', 0.020866264481397154), ('ddos attack', 0.019600803285683352), ('mirror', 0.008730480639335746), ('market', 0.00869482067886305), ('link', 0.006796974885892522), ('empire', 0.006466354814228467), ('working', 0.005511467208810047), ('site', 0.005444179305882651), ('dream', 0.004868246046230519)]
Topic 17:
[('mdma', 0.05029712586370464), ('mda', 0.013310924913221429), ('price', 0.00715570283008589), ('quality', 0.006288672436919001), ('vendor', 0.006210419219646647), ('product', 0.00594084035839644), ('good', 0.005635742979205036), ('mdma vendor', 0.005426100456726388), ('best', 0.005045285978575645), ('pure', 0.004952193362014167)]
Topic 18:
[('darknet', 0.026309787968856437), ('clearnet', 0.016485206590894863), ('dark', 0.009902619460322388), ('link', 0.008738314812911518), ('darknetmarkets', 0.007163487254635122), ('site', 0.006981642800713796), ('net', 0.00684176783829614), ('market', 0.00598010049082556), ('dark net', 0.005616050583562071), ('web', 0.005549040804871239)]
Topic 19:
[('sub', 0.011803117925385692), ('post', 0.011321615362531786), ('mod', 0.01060319978847282), ('banned', 0.00964945190302226), ('link', 0.009165912969444677), ('darkbay', 0.007970810719268799), ('dread', 0.007900190201173194), ('ban', 0.007706270881488553), ('forum', 0.007541849695559157), ('reddit', 0.0069462738518669504)]
Topic 20:
[('bar', 0.04950684022384075), ('alp', 0.019243444730683092), ('hulk', 0.00926871367650088), ('press', 0.00802935840277815), ('pack', 0.006018057507706814), ('got', 0.005817501812992099), ('like', 0.005628402468980107), ('good', 0.005618049144153402), ('vendor', 0.005227340065819387), ('review', 0.004986868721412619)]
Topic 21:
[('xanax', 0.04042229119227533), ('bar', 0.02567911728055766), ('alp', 0.01007415273358043), ('mg', 0.00851122955983346), ('alprazolam', 0.008251311666822189), ('vendor', 0.006041029608891003), ('xanax bar', 0.00593485607734136), ('pressed', 0.0055270571431095756), ('valium', 0.005323569898075285), ('press', 0.0053016848109941625)]
Topic 22:
[('market', 0.06002980864250888), ('new market', 0.009046233373357998), ('market market', 0.007931399130508242), ('good', 0.006821489801315891), ('new', 0.00677353683309554), ('like', 0.005481486353943744), ('know', 0.005076374785231827), ('nt', 0.005022050336460145), ('support', 0.004929201001346894), ('get', 0.004518778632754265)]
Topic 23:
[('feedback', 0.03444248253886573), ('review', 0.030718700245869014), ('vendor', 0.013474110484403888), ('negative', 0.011545968864024621), ('positive', 0.010253621244490108), ('rating', 0.009473312466703535), ('leave', 0.007680138605569759), ('negative feedback', 0.006307446188794368), ('good', 0.006274972711569639), ('star', 0.006199955991028031)]
Topic 24:
[('mirror', 0.12384245235492111), ('working', 0.03061880609809746), ('working mirror', 0.024663818189153267), ('mirror link', 0.015655660240129057), ('link', 0.012268617448636778), ('mirror working', 0.011867099253864992), ('main', 0.011242466432068225), ('work', 0.010766048828322067), ('main link', 0.009478112781112965), ('darkfail', 0.009382804851809609)]
Topic 25:
[('mg', 0.022552080711914627), ('pill', 0.022309527113475044), ('tablet', 0.005783347982928473), ('price', 0.005720858912482134), ('xtc', 0.005368207840580017), ('pharma', 0.005289260491337386), ('product', 0.004981281442486952), ('shipping', 0.004930315073029898), ('mdma', 0.004890800889918645), ('vendor', 0.004738208011922623)]
Topic 26:
[('box', 0.01507705645092756), ('mail', 0.01430918664442757), ('package', 0.012746248234010077), ('address', 0.011722141436383797), ('po', 0.010208143362702484), ('po box', 0.009682986485550526), ('letter', 0.008663193840642925), ('seized', 0.008541210268298533), ('name', 0.008007790133867248), ('office', 0.007319477613446309)]
Topic 27:
[('review', 0.10048344993884399), ('thanks', 0.02951239951963181), ('thanks review', 0.023346761737356704), ('review thanks', 0.018835222600696033), ('nice review', 0.018251657973384218), ('nice', 0.013878063856408933), ('thank', 0.013554866975055504), ('great review', 0.01311074447228659), ('good review', 0.011439870173907015), ('good', 0.011130253652060537)]
Topic 28:
[('ticket', 0.09398125885038333), ('support ticket', 0.03180022997175368), ('support', 0.03003295468427551), ('se en', 0.020794304523012244), ('se', 0.01990675386523344), ('en', 0.01865173567457005), ('please', 0.013618441633747431), ('open', 0.01156458692417624), ('en se', 0.011313523805040087), ('en se en', 0.011313523805040087)]
Topic 29:
[('cryptonia', 0.08473185980971933), ('market', 0.01581112282471231), ('empire', 0.015401860275082129), ('nightmare', 0.007949183375799748), ('vendor', 0.007778377885337722), ('cryptonia empire', 0.007364693237879684), ('empire cryptonia', 0.006900434175528684), ('cryptonia market', 0.006794511613805261), ('like cryptonia', 0.005367698656871601), ('good', 0.005311289636460654)]
Topic 30:
[('escrow', 0.0781073951730002), ('fe', 0.013818264015629545), ('use escrow', 0.011955179834760653), ('vendor', 0.009486694510836597), ('market', 0.009186341953505027), ('use', 0.006906515874159573), ('money', 0.006553177965252795), ('market escrow', 0.00637866495284121), ('nt', 0.006138848640713716), ('listing', 0.00584304426162554)]
Topic 31:
[('onion', 0.09360299836020991), ('dot onion', 0.01880887711801874), ('dot', 0.01796614239573166), ('onion link', 0.01258434516441126), ('onion site', 0.011604460764498814), ('link', 0.011222285262474575), ('site', 0.011164880393774598), ('onion address', 0.007933085583677923), ('darkfail', 0.006300516338983348), ('address', 0.005384343038272572)]
Topic 32:
[('det', 0.04309335455807283), ('er', 0.04124594666216744), ('og', 0.03695943524899436), ('har', 0.0360371570516066), ('jeg', 0.03540430083446446), ('som', 0.03280204038524347), ('ikke', 0.028943143491640906), ('til', 0.02340065298811956), ('en', 0.022095215091447514), ('med', 0.02133531012439731)]
Topic 33:
[('tor', 0.03945658558559981), ('browser', 0.01491449504733792), ('network', 0.012655124931726698), ('javascript', 0.012284635296644586), ('tor browser', 0.00953674138998299), ('service', 0.008430858172643482), ('false', 0.0077274905500037), ('onion', 0.007700340383689038), ('node', 0.00708076649901692), ('user', 0.006846418676208493)]
Topic 34:
[('dread', 0.09198066084924615), ('reddit', 0.010318181549445053), ('post', 0.009361658640387148), ('dread dread', 0.007770426428989851), ('sub', 0.006956986931715927), ('sub dread', 0.006054847466299628), ('see', 0.0059398655581964655), ('community', 0.005535955211242344), ('thanks', 0.005376719511828547), ('back', 0.005256578016520845)]
Topic 35:
[('meth', 0.054096945208171765), ('business day', 0.009062200685332962), ('business', 0.0067945842640072425), ('day', 0.006067556284939821), ('good', 0.0050119902847350776), ('cartel', 0.004972615658272052), ('shit', 0.004758549987559198), ('crystal', 0.004661358436249032), ('meth vendor', 0.004515987759685775), ('vendor', 0.004441003110436076)]
Topic 36:
[('fent', 0.019433370333109795), ('fentanyl', 0.016825345291432387), ('opiate', 0.012872985946302547), ('heroin', 0.0124961089851511), ('nt', 0.0048232591240900515), ('morphine', 0.004786136599314653), ('drug', 0.004635130684867345), ('get', 0.004539407344341598), ('mg', 0.004438688256617271), ('oxy', 0.0043674033803165445)]
Topic 37:
[('link', 0.12188510767712936), ('link link', 0.07034059150323892), ('point link comment', 0.04622937838291348), ('link point link', 0.04622364729136224), ('link comment post', 0.04620647481530487), ('point link', 0.04615410811523949), ('link comment', 0.04599682011097289), ('link point', 0.04558139888022394), ('comment post', 0.045508638169920936), ('ago link', 0.0451619720243576)]
Topic 38:
[('pack', 0.062394240730872894), ('week', 0.010505707539898124), ('day', 0.009133057384074897), ('ordered', 0.008982848750749488), ('land', 0.008976484898790412), ('got pack', 0.008908346931696282), ('got', 0.008490652587246303), ('landed', 0.0074370181546196575), ('waiting', 0.006736140177115034), ('order', 0.006459389948627346)]
Topic 39:
[('pm', 0.10974551148060345), ('interested', 0.053795358744441454), ('looking', 0.022600649556584196), ('find', 0.019933696889789777), ('please', 0.019186351448570265), ('need', 0.014675953291361496), ('send', 0.014498945251083627), ('one', 0.014231977151316244), ('help', 0.012602724965493774), ('know', 0.01236277381373957)]
Topic 40:
[('hugbunter', 0.11207052451511743), ('hugbunter hugbunter', 0.04973560019045556), ('link hugbunter', 0.049668720797846254), ('hugbunter link', 0.04937506683858313), ('link hugbunter hugbunter', 0.04911429929840071), ('hugbunter hugbunter link', 0.04911429929840071), ('link', 0.031585615909879), ('hug', 0.024549221359533135), ('dread', 0.014524637635109377), ('canary', 0.008243971934377406)]
Topic 41:
[('drug', 0.01418453767392391), ('police', 0.009486380671612673), ('court', 0.007974681200021958), ('enforcement', 0.0071529283190103704), ('investigation', 0.007127913563725469), ('dark web', 0.006705652916141706), ('web', 0.006642862665826218), ('law', 0.006540951963866543), ('according', 0.006345524897152996), ('dark', 0.006081634580288165)]
Topic 42:
[('stealth', 0.05794242147292298), ('good', 0.011195973935765447), ('good stealth', 0.009856837520167033), ('vendor', 0.008746592032990011), ('shipping', 0.008737094268244646), ('great', 0.006867547633212028), ('great stealth', 0.006400650556265737), ('decoy', 0.006286202253398744), ('order', 0.006174458083393117), ('pack', 0.00568591001911689)]
Topic 43:
[('counterfeit', 0.028169198429265012), ('note', 0.021509822492536572), ('euro', 0.02048359308798006), ('bill', 0.014076257312596565), ('pen', 0.010771847236603956), ('paper', 0.010546449922920973), ('pen test', 0.010492101321007637), ('printer', 0.010173848718640152), ('fake', 0.00994321682420596), ('money', 0.009447016760703992)]
Topic 44:
[('empire', 0.10987278200488068), ('nightmare', 0.012532538184170867), ('empire empire', 0.012332188961291186), ('find empire', 0.01072633716919983), ('empire nightmare', 0.009621156998352049), ('check empire', 0.006664193879110397), ('find', 0.0062419663619538235), ('support', 0.005816556174365511), ('good', 0.005802997652502144), ('empire support', 0.005663868594177585)]
Topic 45:
[('day', 0.022422881630530838), ('waiting', 0.02023388629559269), ('week', 0.020091817087124538), ('month', 0.015637580264234445), ('hour', 0.01442955058982932), ('time', 0.012989870121637425), ('wait', 0.009865748624109244), ('eta', 0.008166395971073397), ('back', 0.007413446222834265), ('minute', 0.00689201256742615)]
Topic 46:
[('id', 0.02876053122171935), ('passport', 0.028669892251663137), ('fake', 0.017484098707543427), ('license', 0.016850230994283052), ('scan', 0.015535877091556008), ('fake id', 0.011993289487938352), ('card', 0.010312365861375593), ('driver', 0.009428885438599011), ('driver license', 0.009410176764450422), ('real', 0.00929424873888463)]
Topic 47:
[('bank', 0.0375073137008834), ('account', 0.01998101852335951), ('drop', 0.014417987172734532), ('bank drop', 0.011560631553512477), ('cash', 0.011113931486260217), ('bank account', 0.010454288069825388), ('credit', 0.009878069136792548), ('card', 0.00903428766666981), ('transfer', 0.008906429450284163), ('fullz', 0.008351054193526861)]
Topic 48:
[('wickr', 0.1047383941715143), ('use wickr', 0.009668372737955552), ('using wickr', 0.008939058157942804), ('contact', 0.008297311348792595), ('via wickr', 0.007752509507272711), ('via', 0.007298030936887611), ('message', 0.00693714068912386), ('wikr', 0.006839306535852124), ('vendor', 0.006053658882027812), ('deal', 0.005951527135360606)]
Topic 49:
[('de', 0.03533548245775767), ('und', 0.019186451494402953), ('que', 0.01795068195604033), ('un', 0.01599010347128328), ('da', 0.014073688515643248), ('la', 0.014018847140051133), ('wir', 0.012592426662440342), ('et', 0.011781128175927831), ('der', 0.011325469701489298), ('die', 0.011099079522431115)]
Topic 50:
[('phishing', 0.054238417410435794), ('phishing link', 0.02988364614579535), ('link', 0.027871992824067764), ('phished', 0.0158132212423471), ('phishing site', 0.01187187594861008), ('site', 0.011613959713903999), ('darkfail', 0.00991016740535009), ('verify', 0.009252061870479035), ('mirror', 0.008562855585373006), ('link phishing', 0.008230634235483581)]
Topic 51:
[('dream', 0.08184728815591244), ('nightmare', 0.042990625726045915), ('dream dream', 0.010520471690937068), ('anyone', 0.006863330799711735), ('like', 0.006555691018472533), ('nt', 0.00600102381878209), ('think', 0.005427939218583757), ('look', 0.005324198185743176), ('since dream', 0.005203779710007445), ('like dream', 0.005194095957716491)]
Topic 52:
[('price', 0.04048576410122761), ('sale', 0.017703230473379503), ('promo', 0.014010372652010727), ('sell', 0.013448172571520544), ('buy', 0.01270576266578673), ('good', 0.012457931270005612), ('cheap', 0.009491788336025772), ('deal', 0.009239209528258087), ('selling', 0.008910691238275072), ('expensive', 0.008446940313039927)]
Topic 53:
[('tails', 0.03283439657748927), ('tail', 0.024664054739613035), ('usb', 0.0196060271561632), ('electrum', 0.019501174408836548), ('persistent', 0.013556465854670921), ('file', 0.012998669143897206), ('persistence', 0.011626213952861007), ('install', 0.010154304492519282), ('drive', 0.00992295277509104), ('whonix', 0.00847603446750536)]
Topic 54:
[('adderall', 0.02619780315309616), ('amphetamine', 0.016883432841092744), ('mg', 0.009564356698849037), ('replacement', 0.009232477335129403), ('speed', 0.007224099335767274), ('pill', 0.006184416084206999), ('ritalin', 0.005870884077791773), ('sgt', 0.0052776372214904735), ('meth', 0.005178116049965158), ('pharmacy', 0.004948789597610473)]
Topic 55:
[('cancel', 0.04017336301839129), ('order', 0.0328098179052961), ('auto', 0.021651119664660042), ('cancel order', 0.01756480619141743), ('day', 0.014751395372488901), ('cancelled', 0.014445671759800077), ('finalize', 0.01384974110035643), ('vendor', 0.012864062484886444), ('autofinalize', 0.01097140014187583), ('auto finalize', 0.010723259467223174)]
Topic 56:
[('mushroom', 0.03459704414324506), ('shrooms', 0.02004800188276594), ('cubensis', 0.01063636736126624), ('psilocybin', 0.008099215439548132), ('spore', 0.006950787534293387), ('psilocybe', 0.0066682474459377715), ('powdered', 0.006557694114051857), ('grow', 0.0061369509046591865), ('golden', 0.006073174913788356), ('gram', 0.005997655342727958)]
Topic 57:
[('ketamine', 0.06189984013363296), ('gm', 0.012458818317242949), ('gm gm', 0.0111111367574199), ('gm gm gm', 0.00925828050390188), ('vendor', 0.006716681178578894), ('shard', 0.006686958603989414), ('vial', 0.006571282911741415), ('quality', 0.006554865860877837), ('ketamine vendor', 0.0061019005665067585), ('price', 0.005980950274003181)]
Topic 58:
[('exit', 0.08343346331618515), ('exit scam', 0.05842404670714581), ('scam', 0.044540072868492306), ('exit scamming', 0.022534198362006815), ('exit scammed', 0.021031080536119972), ('scamming', 0.020171199525768832), ('scammed', 0.016001200823540753), ('market', 0.011397206992250523), ('market exit', 0.0075494539851574385), ('money', 0.00638935241187803)]
Topic 59:
[('phone', 0.03685883611493524), ('burner', 0.014645239179179202), ('sim', 0.014355380479006138), ('card', 0.011747952135566803), ('number', 0.011682767004247162), ('use', 0.010417111593864077), ('sim card', 0.009055073073389305), ('signal', 0.008892775711048989), ('android', 0.008421212476969062), ('burner phone', 0.007457234680741275)]
Topic 60:
[('dream', 0.04780345322326167), ('market', 0.02851567375910358), ('dream market', 0.021027885001330834), ('nightmare', 0.015220044047811104), ('nightmare market', 0.008762844033761142), ('vendor', 0.008604849243005645), ('wall', 0.006555110761713033), ('wall street', 0.006526170536396068), ('know', 0.005485870706052923), ('street', 0.005471691786097412)]
Topic 61:
[('bond', 0.10177283710442345), ('vendor bond', 0.0645675357015474), ('vendor', 0.026816329912906247), ('bond back', 0.01918651694889492), ('market', 0.011368813119234357), ('vendor bond back', 0.01082018159768633), ('waiver', 0.010598727887579623), ('back', 0.0101735165374901), ('dream', 0.008886088502928904), ('revoke', 0.008405409452802836)]
Topic 62:
[('vpn', 0.03721207404553663), ('tor', 0.028200144523501416), ('use', 0.013402410286614562), ('using', 0.01145360349909562), ('proxy', 0.011116435718120654), ('ip', 0.011067690074856605), ('bridge', 0.009428815371334574), ('whonix', 0.00849861359277727), ('tail', 0.007778997175868967), ('wifi', 0.0077301738914062725)]
Topic 63:
[('jabber', 0.06568221603281825), ('telegram', 0.03406161661270426), ('xmpp', 0.020269487823309534), ('pidgin', 0.010132950277145033), ('otr', 0.010003673163673028), ('chat', 0.008972465523778115), ('contact', 0.008639200672884524), ('icq', 0.008581649704795429), ('wickr', 0.008214910677113316), ('use', 0.008173882382069729)]
Topic 64:
[('dmt', 0.03943784555124977), ('psychedelics', 0.011726194406779748), ('per', 0.01041179610055602), ('psychedelic', 0.010151033964077991), ('changa', 0.008416086774018363), ('lsd', 0.007677168530945536), ('per gram', 0.006457964924874002), ('trip', 0.006356085155401695), ('dpt', 0.005506950524119051), ('gram', 0.0053500575059580125)]
Topic 65:
[('captcha', 0.09656725462433895), ('captchas', 0.012764334842504575), ('page', 0.009331447846694622), ('enter', 0.009280100906742538), ('login', 0.008352420125918192), ('link', 0.0083362850516531), ('ddos', 0.008310337584687669), ('new captcha', 0.007996235704904637), ('server', 0.007211289010492238), ('main', 0.007204254953560555)]
Topic 66:
[('sample', 0.08985863159893867), ('free sample', 0.026813152654664357), ('free', 0.01967093399227681), ('review', 0.013881635563949293), ('sample pack', 0.013072755623359127), ('pack', 0.009507943824925567), ('test', 0.008758221342094103), ('order', 0.008686019327337546), ('got sample', 0.007087016403834942), ('send sample', 0.006908904278506461)]
Topic 67:
[('update', 0.03599327759279911), ('issue', 0.024337781885800616), ('problem', 0.021201860728232335), ('working', 0.0178677962827831), ('fixed', 0.017504781395812463), ('work', 0.014561696793193777), ('bug', 0.012522155343030993), ('thanks', 0.012510950547121237), ('fix', 0.012235177681741193), ('resolved', 0.010532026075715574)]
Topic 68:
[('cgmc', 0.11517422150052703), ('invite', 0.016149142891062347), ('vendor', 0.00975325906739795), ('cgmc cgmc', 0.009317452074834528), ('link cgmc', 0.008519101352879303), ('cgmc vendor', 0.008056978363946638), ('cgmc link', 0.007669038631135997), ('invite code', 0.0076208843596996196), ('vendor cgmc', 0.007602924582643514), ('code', 0.007328957920042097)]
Topic 69:
[('apollon', 0.09610398524842736), ('apollon market', 0.02112905550727932), ('market', 0.018316857221605698), ('empire', 0.012997942473084329), ('apollomarket', 0.01015013468224913), ('apollo', 0.008781478946131514), ('apollon apollon', 0.007410858677417857), ('link apollon', 0.006945733597798516), ('vendor', 0.006805570125621652), ('cryptonia', 0.006409877871186156)]
Topic 70:
[('paypal', 0.07486776081126219), ('transfer', 0.03447677602552438), ('account', 0.024357843936208597), ('paypal account', 0.020868133539659585), ('paypal transfer', 0.0156933328704752), ('western union', 0.011269381823496578), ('union', 0.01066870917389136), ('western', 0.010643337130690966), ('bank', 0.008622449835607669), ('money', 0.008393785101440744)]
Topic 71:
[('giveaway', 0.04690508481686191), ('win', 0.044732779618646655), ('number', 0.031916236655454004), ('winner', 0.030093432182916262), ('contest', 0.01422573507511665), ('link', 0.014142870383111109), ('participate', 0.014115706127395133), ('prize', 0.013935607110717028), ('want win', 0.012597843748595948), ('lotto', 0.012304115611045208)]
Topic 72:
[('pm', 0.0919705637803197), ('working link', 0.08640929543281013), ('link', 0.07719608149493504), ('link please', 0.07423393935612033), ('please', 0.06573280674848452), ('pm link', 0.06353974180668615), ('working', 0.056465646756611906), ('please pm', 0.04621454305616516), ('pm working link', 0.036860558980655024), ('working link please', 0.0345568993255329)]
Topic 73:
[('darkfail', 0.07307155604083337), ('link', 0.033110368821672106), ('fail', 0.03110187376265099), ('dark', 0.028364165367065312), ('dark fail', 0.020569297972971728), ('link darkfail', 0.020391324660866086), ('dark dot', 0.01778673667289363), ('dot fail', 0.017433955597652793), ('dot', 0.017183895189746643), ('dark dot fail', 0.01693501789139882)]
Topic 74:
[('empire', 0.06761588145077328), ('market', 0.019858892941620184), ('empire market', 0.019023835670413568), ('nightmare', 0.009894426304964178), ('alphabay', 0.006677620600847024), ('empire nightmare', 0.006539247141980011), ('vendor', 0.006002948098719143), ('like', 0.005358765960814314), ('berlusconi', 0.005082053013545269), ('good', 0.004984210311078282)]
Topic 75:
[('package', 0.014549307748867537), ('pack', 0.01380351124900245), ('delivery', 0.013760247929872379), ('tracking', 0.01324104816679867), ('day', 0.012228225829446172), ('mail', 0.011641967282648245), ('usps', 0.011180846271431651), ('express', 0.009058228471423136), ('informed delivery', 0.008239235797993539), ('informed', 0.00816525606176161)]
Topic 76:
[('bag', 0.0218708850863682), ('dog', 0.017384906118332724), ('seal', 0.015755246324214874), ('mylar', 0.01407109103569447), ('vac', 0.011351482982205308), ('smell', 0.010161446968059509), ('vacuum', 0.008762129552561957), ('xray', 0.008441467188887415), ('vac seal', 0.008333638511708378), ('drug', 0.007992089475664804)]
Topic 77:
[('opsec', 0.11668877308789992), ('opsec opsec', 0.01554962041490848), ('link opsec', 0.013975003224263025), ('opsec link', 0.01293981780024354), ('opsec opsec link', 0.012099592818069934), ('link opsec opsec', 0.012099592818069934), ('bad opsec', 0.01076616235449375), ('good opsec', 0.009723497802276274), ('link', 0.007607459318664828), ('opsec shit', 0.006666792247239865)]
Topic 78:
[('link', 0.060651829802650395), ('working', 0.039012386803310366), ('working link', 0.02655666220290297), ('main link', 0.022587230829691805), ('link working', 0.020711941687457918), ('main', 0.020204598298805037), ('work', 0.016563829550625592), ('url', 0.015201878983812823), ('link work', 0.014313126063959895), ('main link working', 0.009501430879250292)]
Topic 79:
[('money', 0.06181323215596589), ('pay', 0.018659005539347503), ('money back', 0.01323461107711742), ('dollar', 0.012357774692620414), ('get', 0.011781157227327862), ('steal', 0.010991255605847542), ('get money', 0.010413191282585289), ('back', 0.009943197787347936), ('rich', 0.009436178150210075), ('lost', 0.009421302570257506)]
Topic 80:
[('tracking', 0.04555472102835024), ('tracking number', 0.02009531758403454), ('number', 0.014549036297722027), ('order', 0.013055684090096017), ('day', 0.011661415711019042), ('shipped', 0.010827760928672724), ('package', 0.007961884451057675), ('ordered', 0.007350506995967818), ('vendor', 0.007228862511853636), ('week', 0.007208901064337318)]
Topic 81:
[('guide', 0.1058622488515399), ('tutorial', 0.012160453992527414), ('outdated', 0.010882340175124024), ('thanks', 0.01074381203256329), ('method', 0.008827431977342858), ('send guide', 0.008794555771017756), ('would', 0.008486745762208406), ('pm', 0.008296386970606535), ('buy guide', 0.008194617940468009), ('guide thanks', 0.008102773768585148)]
Topic 82:
[('bir', 0.030431042251861707), ('bu', 0.019992220151183366), ('kai', 0.013116515514243202), ('ama', 0.012087598201548474), ('var', 0.010323942766019788), ('sed', 0.010210139475615325), ('mi', 0.010148457478045665), ('icin', 0.009972073479229515), ('ile', 0.009751530831008109), ('na', 0.008945235111295765)]
Topic 83:
[('rc', 0.02982287146604291), ('mxe', 0.02315625476983254), ('dck', 0.017887533331199874), ('rcs', 0.01563631517471704), ('fdck', 0.011763549037549951), ('meopcp', 0.01164856069461237), ('source', 0.01099412876796653), ('apb', 0.009513262458038243), ('apvp', 0.008702505560106271), ('clearnet', 0.008695149677984921)]
Topic 84:
[('cash', 0.024989281993248996), ('btc', 0.017770974763183865), ('bitcoin', 0.015681372697298013), ('coinbase', 0.012921975395309157), ('atm', 0.012578260683459956), ('localbitcoins', 0.01157109157681611), ('buy', 0.011018745907621693), ('id', 0.009259893847082524), ('exchange', 0.009134329659838127), ('coin', 0.008835343822053126)]
Topic 85:
[('olympus', 0.08979050499785406), ('market', 0.015880260411708532), ('fe escrow', 0.013832817525588672), ('olympus market', 0.012593505065819952), ('dream', 0.011944655652398295), ('exit', 0.010788258187562298), ('scam', 0.009409665079203102), ('exit scam', 0.009407647631792313), ('fe', 0.007684524723620794), ('escrow', 0.0072702370411405065)]
Topic 86:
[('log', 0.06422474915902805), ('logged', 0.04192527952672515), ('logging', 0.022990283434282544), ('login', 0.01652927037036023), ('page', 0.012949847216671756), ('able log', 0.012517271565475484), ('session', 0.01048255233700788), ('time', 0.009495776817483492), ('link', 0.009098677744978852), ('able', 0.008451381287102338)]
Topic 87:
[('vacation', 0.1243532907326314), ('vacation mode', 0.06775206787974397), ('mode', 0.06326261686848363), ('back', 0.015508814317507627), ('profile', 0.011061770318865533), ('go vacation', 0.01014733655307739), ('back vacation', 0.009489452089876295), ('week', 0.009382867035983003), ('say vacation', 0.009320569055591406), ('enjoy vacation', 0.009145545396078522)]
Topic 88:
[('message', 0.04414432197547059), ('contact', 0.019522746999436395), ('email', 0.017772270688176033), ('support', 0.016643136394800466), ('send', 0.0164606438615521), ('inbox', 0.011212181960221914), ('error', 0.009298231772226728), ('notification', 0.009264000239955945), ('send message', 0.009167841532151317), ('reply', 0.008241262770995973)]
Topic 89:
[('post', 0.018045092839630367), ('mod', 0.017406891319516997), ('comment', 0.01640970020385483), ('delete', 0.01041705500206938), ('thread', 0.01020197261754005), ('sub', 0.01006376951502788), ('deleted', 0.00863740929365638), ('vote', 0.00820380610934682), ('button', 0.006726094065497259), ('notification', 0.006640762868556889)]
Topic 90:
[('xmr', 0.0669092475757187), ('wallet', 0.018175299363868372), ('deposit', 0.014013020978880235), ('monero', 0.012911155570731827), ('payment id', 0.012158451286322227), ('payment', 0.011987986589863307), ('xmr deposit', 0.011633394534596823), ('issue', 0.011557346665004352), ('withdrawal', 0.010829426301401593), ('btc', 0.010724593241930265)]
Topic 91:
[('image', 0.04253113570575446), ('exif', 0.034948757023671506), ('upload', 0.031545244164665934), ('exif data', 0.02663151850551448), ('data', 0.021111967604445614), ('photo', 0.020069010667575576), ('metadata', 0.018049516183250763), ('remove', 0.01662077099053795), ('linxli', 0.01657701895323466), ('pic', 0.015406910939308648)]
Topic 92:
[('back', 0.023056999364046565), ('hope', 0.02257059129452718), ('welcome back', 0.012167716170912099), ('luck', 0.01171160143062245), ('good', 0.011128215666626022), ('wish', 0.01053877085199741), ('welcome', 0.009867396732727603), ('glad', 0.009696061364257318), ('man', 0.009571918542005018), ('see', 0.00934302002925438)]
Topic 93:
[('review', 0.05488698710015902), ('template', 0.03611504653951919), ('pic', 0.03484510172441907), ('picture', 0.02718264337205347), ('table', 0.015053472049920686), ('link', 0.014548457372617974), ('review template', 0.013368481787026548), ('formatting', 0.011457612800740312), ('thanks', 0.011371258022359988), ('nice', 0.010934469655671165)]
Topic 94:
[('cheer', 0.17782723050984403), ('cheer cheer', 0.03313750862659381), ('cheer mate', 0.020743097105192693), ('mate', 0.019297604735298277), ('anyone', 0.012159072638624669), ('cheer hey', 0.011834824509497788), ('cheer anyone', 0.009999633310289425), ('know', 0.009755921185339814), ('guy', 0.009739012951154587), ('good', 0.008926994603872149)]
Topic 95:
[('bulk', 0.04502433442985361), ('price', 0.012943163189096501), ('kratom', 0.009602391560900982), ('good', 0.008777631038145118), ('kg', 0.008291117362130453), ('vendor', 0.008203677741691846), ('looking', 0.007704495222672032), ('sell', 0.0067834798899101215), ('quality', 0.006616077525810823), ('buy', 0.006480649470015129)]
Topic 96:
[('wallstreet', 0.08505754108884862), ('wall st', 0.041062485092438025), ('wall', 0.03934169521412448), ('st', 0.03632082390982777), ('wallstreetmarket', 0.02839944332964864), ('dream', 0.019738386360316062), ('wallst', 0.018364840601305287), ('wallstreetmarket wallstreetmarket', 0.013525542549461754), ('link wallstreetmarket wallstreetmarket', 0.013525542549461754), ('wallstreetmarket wallstreetmarket link', 0.013525542549461754)]
Topic 97:
[('product', 0.008859635092686282), ('stealth', 0.007784883132987869), ('shipping', 0.007561955964254681), ('quality', 0.006799656439948533), ('price', 0.006701293969498694), ('vendor', 0.006063408023315248), ('order', 0.005941646028733955), ('good', 0.005328542278256317), ('review', 0.005060509065002832), ('great', 0.0041105811430512205)]
Topic 98:
[('listing', 0.06330496289507025), ('list', 0.026445268879938873), ('superlist', 0.016873298530160532), ('vendor', 0.012107259255754574), ('search', 0.011766662573585754), ('favorite', 0.011304866150270206), ('page', 0.010894845688547483), ('would', 0.007658987295871875), ('filter', 0.007650734371256608), ('new listing', 0.007590895999220696)]
Topic 99:
[('fuck', 0.009332787295000313), ('cunt', 0.008628919666811622), ('dick', 0.0077578508734966745), ('fud fud', 0.007587347666109521), ('fud fud fud', 0.007494680004257013), ('lol', 0.007123344783429645), ('mom', 0.006909444083642832), ('fud', 0.006527258636985005), ('gay', 0.006469079071317363), ('nigga', 0.006393293335467678)]
Topic 100:
[('empire', 0.04024698630146066), ('exit', 0.011262080484729623), ('market', 0.01118967819376233), ('scam', 0.009838062547096009), ('exit scam', 0.00747664089443652), ('empire market', 0.006894982521511149), ('people', 0.006090855398548827), ('scamming', 0.005243233612343388), ('nt', 0.005123485585599628), ('support', 0.0051173300559453185)]
Topic 101:
[('protonmail', 0.08211178339203877), ('protonmailcom', 0.06642801608856601), ('email', 0.045128623866492946), ('proton', 0.041636116158511594), ('secmail', 0.02881874524635854), ('proton mail', 0.019596012217489366), ('protonmailch', 0.013667227368601048), ('mail', 0.011224912618594183), ('wickr', 0.01070667478860652), ('deepbay', 0.008934630948789473)]
Topic 102:
[('wallet', 0.03722981938513912), ('node', 0.03156468978562831), ('gui', 0.03060339064446271), ('monero', 0.025405139490458032), ('remote node', 0.02159692779399663), ('remote', 0.02108338800282159), ('cli', 0.017723642011161053), ('daemon', 0.013237828561208032), ('file', 0.011495466370084044), ('tail', 0.010810233568104117)]
Topic 103:
[('multisig', 0.08480605327811323), ('market', 0.015241005327816859), ('transaction', 0.013862519896290632), ('escrow', 0.013379751858737198), ('use multisig', 0.011526064395402906), ('key', 0.011201509577466094), ('wallet', 0.010892398026835106), ('address', 0.010084027715839551), ('cryptonia', 0.009749339052039417), ('multi', 0.00908482160718631)]
Topic 104:
[('bunk', 0.09365677285623863), ('bunk bar', 0.03383180257895733), ('bar', 0.028711576415180802), ('hulk', 0.014181820819401357), ('sent bunk', 0.012503139088942839), ('pack', 0.012307791934504004), ('reship', 0.0099706463380723), ('got', 0.009446613501569797), ('got bunk', 0.009106665503526302), ('sent', 0.008359077368713629)]
Topic 105:
[('mg', 0.020352590679493434), ('benzo', 0.012613394644539077), ('benzos', 0.01186784081191748), ('alprazolam', 0.01086474761967856), ('alp', 0.010444715713972078), ('pill', 0.008460282420480103), ('etizolam', 0.008260463746063979), ('per', 0.007969738355167011), ('bar', 0.007902768993167286), ('xanax', 0.0070572619004457444)]
Topic 106:
[('pelican', 0.19226633530106949), ('bird', 0.044635231337340306), ('pelicanvendor', 0.03941775101696109), ('bigbird', 0.019779007924753626), ('pelicanvendor pelicanvendor link', 0.019461389223608322), ('pelicanvendor pelicanvendor', 0.019461389223608322), ('link pelicanvendor pelicanvendor', 0.019461389223608322), ('link pelicanvendor', 0.019461389223608322), ('pelicanvendor link', 0.019461389223608322), ('pelican bigbird', 0.017087184020164802)]
Topic 107:
[('heinekenexpress', 0.0837508363087823), ('link heinekenexpress', 0.03134510901757307), ('heinekenexpress link', 0.028543273613349372), ('heinekenexpress heinekenexpress', 0.02846064703095021), ('link heinekenexpress heinekenexpress', 0.028188454536539113), ('heinekenexpress heinekenexpress link', 0.028188454536539113), ('link', 0.024165839581671855), ('heineken', 0.022785904470705217), ('review', 0.014392754971946966), ('product', 0.00927336394252171)]
Topic 108:
[('rdp', 0.042411133347710836), ('sock', 0.03560371798033193), ('vpn', 0.014076396682403649), ('ip', 0.013915099190889322), ('card', 0.013451516108376471), ('proxy', 0.013081346253179112), ('use', 0.012219247004167016), ('socks', 0.011639625673788151), ('rdps', 0.011094457542798442), ('carding', 0.009221407358884022)]
Topic 109:
[('dnm', 0.019318824601527097), ('dm', 0.013234343415873292), ('dread', 0.011586188645974847), ('forum', 0.011217882125336184), ('link', 0.010857081257193877), ('dnstars', 0.010723644902855939), ('reddit', 0.010110665301857408), ('sub', 0.009715236189388344), ('dnms', 0.009617115636908079), ('community', 0.008815003762816808)]
Topic 110:
[('pic', 0.08798260552829079), ('picture', 0.044315863117110804), ('photo', 0.023665049954769066), ('photoshop', 0.019886058122954426), ('post pic', 0.018579802444378672), ('post', 0.014231022522399248), ('timestamp', 0.012478973067461723), ('image', 0.011658145309600952), ('please', 0.008630026850004678), ('bud', 0.008029794941935182)]
Topic 111:
[('empire', 0.057661772719845454), ('link', 0.04004491250452288), ('empiremarket', 0.026616080508408132), ('empire link', 0.023238813050905235), ('link empire', 0.019217233101687416), ('working', 0.01807371711893264), ('empire market', 0.016581048320177965), ('forum', 0.015477205920117377), ('empire forum', 0.015477145152006933), ('link empiremarket', 0.0152743125229592)]
Topic 112:
[('invite', 0.1671495519824375), ('invite code', 0.08250556656344046), ('code', 0.0633151087446098), ('need invite', 0.02527243955187877), ('get invite', 0.024790903448788374), ('invitation', 0.013713952497831144), ('get invite code', 0.013404199145471854), ('send invite', 0.013139155639535056), ('register', 0.012530008654303777), ('anyone', 0.012380086709918171)]
Topic 113:
[('samsara', 0.11383384349850058), ('market', 0.019696302588425578), ('samsara market', 0.019104278767296878), ('sam', 0.015574436285877171), ('dream', 0.012914363674112224), ('samsara samsara', 0.011046034136147152), ('cryptonia', 0.01025294794026358), ('support', 0.008635519949188524), ('empire', 0.008377918146973879), ('dream market', 0.007993715281883484)]
Topic 114:
[('chemical', 0.00941712444197636), ('test', 0.00698539734569969), ('lab', 0.00594436195144374), ('powder', 0.005922176294883902), ('product', 0.005365926939192235), ('purity', 0.005184075751997104), ('vendor', 0.005158476901656329), ('sample', 0.0049240993248521785), ('know', 0.004756783735925156), ('research chemical', 0.004673370918831879)]
Topic 115:
[('rapture', 0.11342721626444677), ('rapture market', 0.017375788477946123), ('rapturemarket', 0.013730603814303506), ('market', 0.013212257705240179), ('gbp', 0.010761215706038798), ('tab ug gbp', 0.010008164441943749), ('voidrealm', 0.009949066773080372), ('ug gbp', 0.009649941839423042), ('voidrealm aztec tab', 0.009591157102863436), ('voidrealm aztec', 0.009295294566840948)]
Topic 116:
[('water', 0.011463174046512533), ('acetone', 0.0082890064363493), ('powder', 0.005756210479966381), ('dry', 0.005729933331167575), ('ml', 0.0057174500518880475), ('solution', 0.005116559771873014), ('dissolve', 0.00471606802634953), ('like', 0.004493407278856929), ('speed', 0.00436537897350733), ('solvent', 0.004252693942242391)]
Topic 117:
[('witchman', 0.1329423135487746), ('link witchman', 0.0615947119151041), ('link', 0.05945712771682505), ('witchman link', 0.05790309217242579), ('witchman witchman', 0.05650223596449077), ('link witchman witchman', 0.05546045414048933), ('witchman witchman link', 0.05546045414048933), ('envoy', 0.04236915950823802), ('link link', 0.029091536666547322), ('link link witchman', 0.023245339842409098)]
Topic 118:
[('tochka', 0.12916091988911232), ('market', 0.016623251444736028), ('tochka market', 0.009504055884703176), ('tochka tochka', 0.007013758619830559), ('use tochka', 0.0069322593541119425), ('scam', 0.006779046642263748), ('vendor', 0.006577375776768368), ('honeypot', 0.006359300812408086), ('tochka honeypot', 0.005941936589238809), ('like', 0.005762818821293863)]
Topic 119:
[('post', 0.02079526327100141), ('know guy know', 0.019708694388546215), ('guy know guy', 0.019688155529190104), ('know guy', 0.01699803267269902), ('guy know', 0.01634881468315262), ('read', 0.014316793962685056), ('title', 0.013610259709350484), ('guy', 0.010327350303089605), ('know', 0.008961533825381052), ('thread', 0.0087171069802534)]
Topic 120:
[('subdread', 0.047141772290036604), ('sub', 0.01818068368949251), ('post', 0.015266693761681582), ('subdreads', 0.014886822738452496), ('create', 0.012782270138236513), ('dread', 0.0123603742621837), ('link', 0.012271288165749638), ('forum', 0.00785628393782696), ('create subdread', 0.006895120712568126), ('new', 0.0063622132102890225)]
Topic -1:
[('vendor', 0.004989482902484623), ('order', 0.004538694214898495), ('nt', 0.004483335735564212), ('market', 0.004216268661986249), ('link', 0.004080981455000956), ('get', 0.0037685619353366876), ('like', 0.0036515031685414347), ('time', 0.003367037424678727), ('would', 0.0033535370964552494), ('know', 0.0031630292311466738)]
In [12]:
topic_model.visualize_topics()

120ClusterDistribution.png

In [13]:
sihouette, davies, X = btu.calculate_silhouette_davies(umap_embeddings, new_topics)
coherence_score = btu.evaluate_topic_coherence(topic_model.get_topics(), corpus)
dos_score = btu.calculate_dos(topic_model.get_topics())
Silhouette Score: 0.47547927498817444
Davies-Bouldin Score: 0.6798967697902085
Coherence Model: 0.6931099961796344
Distinta Overlap Score: 0.2429210134128167
In [14]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
WARNING:tensorflow:From c:\Users\dommy\miniconda3\envs\gestione\Lib\site-packages\tf_keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

In [23]:
zero_shot_topics = pd.read_csv('../../../Datasets/IntentCrime/intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = btu.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
Assigning labels to topics:   0%|          | 0/121 [00:00<?, ?it/s]
Assigning labels to topics: 100%|██████████| 121/121 [1:25:39<00:00, 42.47s/it]
In [36]:
dict_zero_shots_25[0] = 'weed - thc - cart'
dict_zero_shots_25[5] = 'thanks'
dict_zero_shots_25[9] = 'pgp - pgp signature - hash signature'
dict_zero_shots_25[14] = 'drug - police - gun'
dict_zero_shots_25[15] = 'monero - btc - wallet'
dict_zero_shots_25[19] = 'post banned'
dict_zero_shots_25[20] = 'good vendor - bar - hulk'
dict_zero_shots_25[25] = 'xtc - mg - pill'
dict_zero_shots_25[27] = 'review thanks'
dict_zero_shots_25[30] = 'escrow'
dict_zero_shots_25[31] = 'onion link'
dict_zero_shots_25[34] = 'dread - reddit - post'
dict_zero_shots_25[40] = 'hugbunter - link hugbunter'
dict_zero_shots_25[44] = 'empire - dread'
dict_zero_shots_25[45] = 'timing to shipment'
dict_zero_shots_25[46] = 'fake ID - passport'
dict_zero_shots_25[51] = 'dream - dread'
dict_zero_shots_25[64] = 'dmt - lsd - pyschadelic'
dict_zero_shots_25[65] = 'captcha'
dict_zero_shots_25[68] = 'invite - cgmc'
dict_zero_shots_25[71] = 'giveaway'
dict_zero_shots_25[77] = 'opsec - opsec questions'
dict_zero_shots_25[78] = 'working links'
dict_zero_shots_25[85] = 'olympus - scam'
dict_zero_shots_25[86] = 'logged'
dict_zero_shots_25[88] = 'contact info'
dict_zero_shots_25[90] = 'xmr - wallet'
dict_zero_shots_25[93] = 'review template'
dict_zero_shots_25[94] = 'cheer'
dict_zero_shots_25[98] = 'attending list'
dict_zero_shots_25[100] = 'empire site scammer'
dict_zero_shots_25[101] = 'proton email'
dict_zero_shots_25[102] = 'wallet - node'
dict_zero_shots_25[106] = 'pelican - bird'
dict_zero_shots_25[107] = 'heineken express link'
dict_zero_shots_25[109] = 'dreat - dm'
dict_zero_shots_25[110] = 'picture'
dict_zero_shots_25[111] = 'empire link'
dict_zero_shots_25[112] = 'invite code'
dict_zero_shots_25[113] = 'samsara market'
dict_zero_shots_25[117] = 'witchman - witchman link'
In [37]:
btu.save_assigned_labels(dict_zero_shots_25, 'ZeroShotClassificationResultsContent/all-MiniLM-L6-v2_190_20n_8dim/zero_shot_025.csv')
In [20]:
topic_model.set_topic_labels(dict_zero_shots_25)
In [21]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(embeddings)
In [39]:
topic_model.visualize_documents(corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

120Agglomeration.png

In [40]:
topic_model.visualize_barchart(top_n_topics=250, custom_labels=True, n_words=10, width=400)

120Distribution.png

In [41]:
topic_model.visualize_hierarchy(custom_labels=True)

image-2.png

In [42]:
topic_model.visualize_heatmap(custom_labels=True)

image.png

In [9]:
df['content'] = df['content'].str.lower().dropna()
df.drop_duplicates(subset='content', inplace=True)
df.dropna(subset=['content'], inplace=True)
created_on = df['created_on_post'].tolist()
In [14]:
topics_over_time = topic_model.topics_over_time(corpus, created_on, nr_bins=10)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
10it [4:30:15, 1621.58s/it]

image.png

In [24]:
results_final = btu.return_dataset(corpus, created_on, embeddings, new_topics, probs, topic_model, umap_embeddings)
print(results_final.shape)
results_final.head()
(169288, 11)
Out[24]:
Document Embedding Topic Probability Created_on Count Name CustomName Representation Representative_Docs UMAP_embedding
0 finally got dream lilxan account confirmed pgp... [-0.05092696, -0.028831957, 0.03425763, -0.008... 9 [0.0019145853509105579, 0.0034859457117316834,... 2019-10-16 2462 9_pgp_begin pgp_begin_pgp signature pgp - pgp signature - hash signature [pgp, begin pgp, begin, pgp signature, signatu... [vendor links last online fri dec utc last onl... [0.18862869, 1.9764707, 6.3155804, -0.32967466...
1 im issues vendor account issues withdrawing cm... [0.0006082218, 0.061474808, 0.017827673, 0.010... 1 [0.004161525996584431, 0.09864712150395494, 0.... 2019-10-30 5365 1_deposit_address_ticket_btc ticket - deposit - address [deposit, address, ticket, btc, wallet, deposi... [missing two deposit week ago big deposit erro... [0.71821356, -0.18454741, 2.907359, -0.1046811...
2 making switch xmr besides xmr hodler favorite ... [-0.063973725, -0.040487997, -0.025320696, -0.... 15 [0.002285542372071366, 0.007466206317565759, 0... 2019-10-16 2995 15_monero_xmr_wallet_btc monero - btc - wallet [monero, xmr, wallet, btc, exchange, bitcoin, ... [mixing bitcoin old school easy get scammed fe... [0.15870488, 0.024624836, 3.1060727, -0.176952...
3 got free cooky cart order cannacreations one e... [-0.021990731, 0.020082157, 0.017604021, 0.002... 0 [0.2930367581669767, 0.0036731910177007464, 0.... 2019-10-16 14083 0_cart_weed_strain_thc weed - thc - cart [cart, weed, strain, thc, bud, price, product,... [general review template general information d... [-0.63843, 0.5993336, 2.1778893, -0.6935897, 1...
4 bg gone either oc look like he even cheaper oc... [-0.13640997, 0.040470857, -0.0007943742, 0.06... 38 [0.005669327905041996, 0.0035937638895178897, ... 2019-10-16 1499 38_pack_week_day_ordered order [pack, week, day, ordered, land, got pack, got... [confirm ordered pm last week messaged saying ... [0.5634161, -0.7750787, 2.1607575, -0.37249324...
In [25]:
results_final.to_parquet('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim.parquet')
In [27]:
topic_model.save("ModelsContent/topic_model_all-MiniLM-L6-v2_190_20n_8dim", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
2024-07-05 02:05:34,710 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
In [26]:
topic_model.save("ModelsContent/topic_model_all-MiniLM-L6-v2_190_20n_8dim_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)

LLAMA¶

In [ ]:
# Use llama.cpp to load a Quantized LLM
llm = Llama(model_path="../../../openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=9096, stop=["Q:", "\n"])
In [ ]:
df = pd.read_parquet('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim.parquet')[['Topic', 'Representation', 'Representative_Docs']]
df = df.drop_duplicates(subset='Topic')
df.sort_values(by='Topic', inplace=True)
df
In [ ]:
#topic_label = btu.process_dataset(llm, df) # 480 minutes 
In [3]:
#topic_label.to_csv('LLAMA/topic_model_all_MiniLM-L6-v2_190_20n_8dim_LLAMA_results.csv', index=False)
new_topics = pd.read_csv('LLAMA/topic_model_all_MiniLM-L6-v2_190_20n_8dim_LLAMA_results.csv')
In [4]:
topic_model.set_topic_labels(new_topics.set_index('Topic')['Label'].to_dict())
In [6]:
topic_model.visualize_barchart(top_n_topics=250, custom_labels=True, n_words=10, width=400)

120Distribution_LLAMA2.png

In [7]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
20it [5:36:20, 1009.00s/it]

image.png

In [ ]:
topics_over_time[topics_over_time['Topic'] != -1].sort_values(by=['Topic', 'Timestamp']).to_csv('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim_topics_over_time_20.csv', index=False)
In [9]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=10)
topics_over_time[topics_over_time['Topic'] != -1].sort_values(by=['Topic', 'Timestamp']).to_csv('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim_topics_over_time_10.csv', index=False)    
10it [4:58:13, 1789.35s/it]
In [11]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=5)
topics_over_time[topics_over_time['Topic'] != -1].sort_values(by=['Topic', 'Timestamp']).to_csv('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim_topics_over_time_5.csv', index=False)
5it [4:37:01, 3324.33s/it]
In [10]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=3)
topics_over_time[topics_over_time['Topic'] != -1].sort_values(by=['Topic', 'Timestamp']).to_csv('DatasetsContentBERTopic/BERTopic_all-MiniLM-L6-v2_190_20n_8dim_topics_over_time_3.csv', index=False)    
3it [6:24:22, 7687.55s/it]
In [16]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5, custom_labels=True)

image.png

In [6]:
topic_model.save("ModelsContent/topic_model_all-MiniLM-L6-v2_190_20n_8dim_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model='all-MiniLM-L6-v2')
topic_model.save("ModelsContent/topic_model_all-MiniLM-L6-v2_190_20n_8dim", serialization="pickle", save_ctfidf=True, save_embedding_model='all-MiniLM-L6-v2')
2024-07-28 20:23:46,838 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.

Examples¶

In [16]:
topic_model = BERTopic.load("ModelsContent/topic_model_all-MiniLM-L6-v2_190_20n_8dim", embedding_model='all-MiniLM-L6-v2')
In [12]:
sentence = ['recently closed Samsara market']
btu.predict_topic(topic_model, sentence, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
Out[12]:
Topic Probability Label Words Sentence
0 [(samsara, 0.11383384349850058), (market, 0.01... 1.0 Samsara Market [samsara, market, samsara market, sam, dream, ... recently closed Samsara market
1 [(subdread, 0.047141772290036604), (sub, 0.018... 0.0 Subdread creation issues [subdread, sub, post, subdreads, create, dread... recently closed Samsara market
2 [(empire, 0.10987278200488068), (nightmare, 0.... 0.0 Empire Name Search [empire, nightmare, empire empire, find empire... recently closed Samsara market
3 [(onion, 0.09360299836020991), (dot onion, 0.0... 0.0 Onion Links [onion, dot onion, dot, onion link, onion site... recently closed Samsara market
4 [(det, 0.04309335455807283), (er, 0.0412459466... 0.0 Kola;Vendor;Stealth Shipping;Review;Norway [det, er, og, har, jeg, som, ikke, til, en, med] recently closed Samsara market
In [17]:
btu.predict_topic(topic_model, ['i want to buy milk and coffee'], custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
Out[17]:
Topic Probability Label Words Sentence
0 [(water, 0.011463174046512533), (acetone, 0.00... 0.017596 Acetone Recrystallization Techniques [water, acetone, powder, dry, ml, solution, di... i want to buy milk and coffee
1 [(product, 0.008859635092686282), (stealth, 0.... 0.015278 Product Review [product, stealth, shipping, quality, price, v... i want to buy milk and coffee
2 [(chemical, 0.00941712444197636), (test, 0.006... 0.015159 Chemistry Research and Supply [chemical, test, lab, powder, product, purity,... i want to buy milk and coffee
3 [(coke, 0.03196509150656362), (cocaine, 0.0241... 0.014840 High Quality Cocaine [coke, cocaine, quality, product, good, cut, p... i want to buy milk and coffee
4 [(cart, 0.006819976279333017), (weed, 0.006434... 0.014655 Product Reviews and Purchases [cart, weed, strain, thc, bud, price, product,... i want to buy milk and coffee
In [18]:
btu.predict_topic(topic_model, ['where i can found some keys account?'], custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.35it/s]
Out[18]:
Topic Probability Label Words Sentence
0 [(key, 0.02641300395674321), (pgp, 0.022053264... 1.000000e+00 PGP Key Security [key, pgp, account, pgp key, password, message... where i can found some keys account?
1 [(phishing, 0.054238417410435794), (phishing l... 1.109081e-12 Phishing Detection Techniques [phishing, phishing link, link, phished, phish... where i can found some keys account?
2 [(deposit, 0.032787925045137875), (address, 0.... 4.357905e-13 Empire Deposit & Withdrawal Issues [deposit, address, ticket, btc, wallet, deposi... where i can found some keys account?
3 [(tails, 0.03283439657748927), (tail, 0.024664... 3.935350e-13 Tails;Electrum;Persistent File;USB Installation [tails, tail, usb, electrum, persistent, file,... where i can found some keys account?
4 [(wallet, 0.03722981938513912), (node, 0.03156... 3.831501e-13 Monero Wallet Update [wallet, node, gui, monero, remote node, remot... where i can found some keys account?
In [19]:
btu.predict_topic(topic_model, ['fuck you'], num_classes=3, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.58it/s]
Out[19]:
Topic Probability Label Words Sentence
0 [(thanks, 0.032484060113625514), (thank, 0.020... 9.610843e-01 Friendly Positive Talk [thanks, thank, lol, man, bro, good, nice, wor... fuck you
1 [(fuck, 0.009332787295000313), (cunt, 0.008628... 1.215710e-18 Mom sex;Insults [fuck, cunt, dick, fud fud, fud fud fud, lol, ... fuck you
2 [(back, 0.023056999364046565), (hope, 0.022570... 4.776323e-19 hope recovery [back, hope, welcome back, luck, good, wish, w... fuck you
In [20]:
btu.predict_topic(topic_model, ['how many bitcoins does it cost?'], num_classes=1, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.50it/s]
Out[20]:
Topic Probability Label Words Sentence
0 [(wallet, 0.03722981938513912), (node, 0.03156... 0.011528 Monero Wallet Update [wallet, node, gui, monero, remote node, remot... how many bitcoins does it cost?
In [21]:
btu.predict_topic(topic_model, ['hashish weed 2€'], num_classes=1, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.91it/s]
Out[21]:
Topic Probability Label Words Sentence
0 [(cart, 0.006819976279333017), (weed, 0.006434... 1.0 Product Reviews and Purchases [cart, weed, strain, thc, bud, price, product,... hashish weed 2€
In [22]:
btu.predict_topic(topic_model, ['i want to buy some guns'], num_classes=10, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.07it/s]
Out[22]:
Topic Probability Label Words Sentence
0 [(drug, 0.006367626524558121), (police, 0.0060... 0.015678 Drugs and Police Enforcement [drug, police, get, nt, house, cop, people, la... i want to buy some guns
1 [(counterfeit, 0.028169198429265012), (note, 0... 0.006188 Counterfeit Money Sales [counterfeit, note, euro, bill, pen, paper, pe... i want to buy some guns
2 [(dnm, 0.019318824601527097), (dm, 0.013234343... 0.005972 DNM Reddit Subs [dnm, dm, dread, forum, link, dnstars, reddit,... i want to buy some guns
3 [(box, 0.01507705645092756), (mail, 0.01430918... 0.005853 Mail Delivery Issues [box, mail, package, address, po, po box, lett... i want to buy some guns
4 [(price, 0.04048576410122761), (sale, 0.017703... 0.005852 Sale;Promotional Offers;Good Deals [price, sale, promo, sell, buy, good, cheap, d... i want to buy some guns
5 [(sub, 0.011803117925385692), (post, 0.0113216... 0.005752 Dread Market Forum Rules and Bans [sub, post, mod, banned, link, darkbay, dread,... i want to buy some guns
6 [(vendor, 0.040480309109643724), (good, 0.0095... 0.005739 Vendor Recommendation [vendor, good, anyone, know, legit, looking, n... i want to buy some guns
7 [(feedback, 0.03444248253886573), (review, 0.0... 0.005723 ecommerce feedback [feedback, review, vendor, negative, positive,... i want to buy some guns
8 [(scam, 0.030207670216535733), (scammer, 0.024... 0.005666 Vendor Scams and Detection [scam, scammer, scam scam, scam scam scam, sca... i want to buy some guns
9 [(product, 0.008859635092686282), (stealth, 0.... 0.005576 Product Review [product, stealth, shipping, quality, price, v... i want to buy some guns
In [23]:
btu.predict_topic(topic_model, ['i like child'], num_classes=3, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.45it/s]
Out[23]:
Topic Probability Label Words Sentence
0 [(back, 0.023056999364046565), (hope, 0.022570... 9.383120e-01 hope recovery [back, hope, welcome back, luck, good, wish, w... i like child
1 [(pm, 0.10974551148060345), (interested, 0.053... 6.295266e-04 PM Interested Help Explanation [pm, interested, looking, find, please, need, ... i like child
2 [(fuck, 0.009332787295000313), (cunt, 0.008628... 8.768294e-17 Mom sex;Insults [fuck, cunt, dick, fud fud, fud fud fud, lol, ... i like child
In [25]:
btu.predict_topic(topic_model, ['fake id'], num_classes=3, custom_labels=True)
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.71it/s]
Out[25]:
Topic Probability Label Words Sentence
0 [(id, 0.02876053122171935), (passport, 0.02866... 0.854973 Fake IDs & Documents [id, passport, fake, license, scan, fake id, c... fake id
1 [(counterfeit, 0.028169198429265012), (note, 0... 0.000007 Counterfeit Money Sales [counterfeit, note, euro, bill, pen, paper, pe... fake id
2 [(bank, 0.0375073137008834), (account, 0.01998... 0.000006 Bank Drop Transaction [bank, account, drop, bank drop, cash, bank ac... fake id
In [ ]:
import nbconvert

!jupyter nbconvert --to html show_results_content.ipynb